# -*- coding:utf-8 -*-
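# Extract special characters: scan a.html for sections containing flagged keywords and total their section IDs.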

import re

# Script: read a.html, split it into sections, and for every section that
# contains one of the flagged keywords add that section's numeric ID to a
# running total, printing the total after each hit.

# Keyword regexes. `.?` between the characters allows at most one arbitrary
# filler unit (including a newline, because of re.S) so that lightly
# obfuscated spellings still match.
keys = ['不.?良.?鑫.?玺', '违.?规.?蚊.?苯']

# `with` guarantees the handle is closed even if read() raises.
with open('a.html') as file_object:
    page = file_object.read()

# A section runs from an opening <h2> up to the next triple <br/>.
# The non-greedy `.*?` stops at the FIRST "<br/><br/><br/>"; a greedy `.*`
# would swallow everything up to the last one and merge all sections.
sections_p = re.compile('<h2>(.*?)<br/><br/><br/>', re.S)
sections = re.findall(sections_p, page)

# The section ID is the text between the label and the closing </h2>.
section_id_p = re.compile('文本编号：(.*?)</h2>', re.S)

# Compile every keyword once instead of once per section/key pair.
key_patterns = [re.compile(key, re.S) for key in keys]

id_sum = 0
for one_section in sections:
    # One matching keyword is enough; a section is counted at most once.
    if not any(pattern.search(one_section) for pattern in key_patterns):
        continue

    id_match = section_id_p.search(one_section)
    if id_match is None:
        # Flagged section without an ID header: there is nothing to add, and
        # calling .group() on None would abort the whole run.
        continue

    # int() tolerates surrounding whitespace; a non-numeric ID still raises
    # ValueError so malformed input is not silently miscounted.
    id_sum += int(id_match.group(1))
    # Single-argument call form prints identically on Python 2 and 3.
    print(id_sum)

 